import csv
import datetime,calendar
import json
import re
import time
import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver import ActionChains

def main():
    ''' Top-level control flow: optionally resume from the log, then crawl.

    Uses the module-level `driver` (created in the __main__ block) and the
    module-level `logs` file handle.
    '''

    # Declared global so the except branch can shut the shared driver down.
    global driver

    try:
        # Optionally resume a previous run from ./logs/info.log.
        if input("是否读取日志继续爬取（1/0）：") == '1':
            read_log()
        jugesd()
    except Exception as e:
        print(e)
        print( e.__traceback__.tb_lineno)
        # Log: webdriver failure + current time.
        record_log_exception('webdriver启动失败',e)
        print('重新执行')
        # NOTE(review): despite the "re-run" message above, nothing restarts
        # the crawl here — confirm whether a retry loop was intended.
        driver.quit()

    finally:
        # Persist a resume point (topic / crawl date / page) so read_log() can
        # pick up from here next run.  The original wrote a literal '{}'
        # placeholder, which read_log() could never extract a resume point from;
        # write the same JSON payload record_log_exception() writes.
        logs.write('[DATA]\t{}\n'.format(
            json.dumps({"p_now": p_now, "now_time": now_time, "type_now": type_now})))
        logs.flush()

def creat_driver():
    ''' Create and return a configured Chrome webdriver.

    Anti-detection tweaks: hides the "controlled by automation" banner and
    the AutomationControlled blink feature.  Images are disabled to speed up
    page loads.  Page loads time out after 15 seconds.
    '''

    print('创建driver')
    option = webdriver.ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation'])  # hide webdriver automation banner
    # option.add_argument('--headless')
    option.add_argument("--disable-blink-features=AutomationControlled")
    option.add_argument("--no-sandbox")
    # Fix: the original passed "--disable-dev-usage", which is not a real
    # Chrome switch; the intended flag is --disable-dev-shm-usage (avoids
    # /dev/shm exhaustion crashes in constrained environments).
    option.add_argument("--disable-dev-shm-usage")
    # 2 = block images.
    option.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
    driver = webdriver.Chrome(executable_path='./chromedriver.exe', options=option)
    driver.set_page_load_timeout(15)
    return driver

def jugesd():
    ''' Main crawl loop.

    Logs in once, then for each topic in type_list walks forward through
    `step`-day founding-date windows, pages through each window's results,
    and hands every page to parse_data().  Resume state (start_time / log_p,
    set by read_log()) applies to the first window/page crawled.
    '''

    global num
    global p_now
    global start_time
    global end_time
    global now_time
    global type_list
    global type_now
    global current_url
    global key_num
    global log_p
    global driver

    step = 3
    # Size of each crawl date window, in days.

    my_cookie = login( 'nc_1_n1z')
    # Log in through the browser...
    headers['cookie'] = my_cookie
    # ...and reuse the browser session cookie for the plain `requests` calls.

    print('登陆成功')

    logs.write('[INFO]\t登陆成功\t当前时间：{}\n'.format(time.strftime("%Y-%m-%d %H:%M:%S",time.localtime())))
    logs.flush()
    ''' 实现登陆，获取cookie '''

    ''' 根据种类顺序爬取数据 '''
    for key in type_list:
        # One pass per topic; page count is derived from the hit count below.
        type_now = key
        # Remember the current topic (used by logging and parse_data()).
        year = start_time[0]
        month = start_time[1]
        day = start_time[2]

        print("开始爬取{}当前爬取时间{}".format(key, year))

        while True:
            # Stop when the crawl date catches up with today.
            # NOTE(review): comparing `month-1` against the current month looks
            # like an off-by-one (stops a month away from today) — confirm.
            if str(year) == datetime.date.today().strftime('%Y') and month-1 == int(datetime.date.today().strftime('%m')):
                # 当爬取日期等于当前日期停止
                print("{}爬取结束".format(type_now))
                break

            if len(end_time) == 2:
                if year == end_time[0] and month == end_time[1]:
                    # Stop at the user-selected [year, month] end date.
                    print("{}爬取结束".format(type_now))
                    break

            # Year rollover.
            # NOTE(review): month is incremented at the bottom of the loop, so
            # when it reaches 12 this resets it to January before December has
            # been fully crawled — December gets at most one window; confirm.
            if month == 12:
                year += 1
                month = 1

            # Month rollover: restart at day 1 when the window would run past
            # the end of the month.
            if day+step >= calendar.monthrange(year, month)[1]:
                day = 1
                month += 1
            now_time = [year, month, day]

            start = datetime.date(year,month,day).strftime('%Y%m%d')
            end = datetime.date(year,month,day+step).strftime('%Y%m%d')
            day += step
            print('当前时间段{}-{}'.format(start,end))
            # now_time above is the resume point written to the log.

            ''' 爬取当前筛选总条数和页数 '''

            try:
                # First fetch of page 1 populates the global key_num (hit count).
                datas = get_data(1, start, end, key)

            except Exception as e:
                # On any failure, refresh the browser once and retry.
                print('get_data():',e)
                driver.refresh()
                datas = get_data(1, start, end,key)

            if key_num == 0:
                print('当前日期没有合适数据')
                continue

            p_all = int(key_num / 20 + 1)
            # 20 rows per result page.
            # NOTE(review): when key_num is an exact multiple of 20 this counts
            # one page too many — confirm.

            print('当前时间一共有数据{}，页数{}'.format(key_num, p_all))

            ''' 循环爬取当前查询 '''
            start_p = 1

            if log_p != -1:
                start_p = log_p
                log_p = -1
            # Resume from the page recorded in the log (consumed once).

            for i in range(start_p, p_all + 1):
                p_now = i
                # Track the current page for the resume log.
                if i == 251 :
                    # Presumably the site caps results at 250 pages; the fix is
                    # to shrink `step` so each window has fewer hits — confirm.
                    print('数据过多，剩余{}未能爬取，请调整step'.format(p_now-250))
                    ''' info '''
                    break

                print('当前获取页数：{}\t当前时间：{}-{}'.format(i,start,end))
                try:
                    datas = get_data(i,start,end,key )

                except Exception as e:
                    # On any failure, log it, refresh the browser once and retry.
                    print("获取数据异常get_data()",e)
                    record_log_exception('获取数据异常get_data()',e)
                    driver.refresh()
                    datas = get_data(i,start,end,key)

                parse_data(datas)

def login(id='nc_1_n1z'):
    ''' Log in through the browser and return the session cookies as a
    'name=value;...' header string.

    id: element id of the slide-captcha slider (note: shadows the builtin).
    Rotates through users_list across calls via the global users_id index.
    '''

    global users_list
    global users_id
    global driver

    driver.delete_all_cookies()
    url = "https://www.qcc.com/user_login?back=%2F"

    driver.get(url)
    time.sleep(1)

    driver.find_element_by_id('normalLogin').click()  # switch to password login
    time.sleep(1)
    # Fill in the current account's credentials.
    driver.find_element_by_id('nameNormal').send_keys(users_list[users_id]['username'])
    driver.find_element_by_id('pwdNormal').send_keys(users_list[users_id]['password'])
    users_id += 1
    time.sleep(1)
    if users_id == len(users_list):
        users_id = 0
    # Advance (and wrap) users_id after every login so accounts rotate.

    # button = driver.find_element_by_id(id)

    # Drag the slide captcha, nudging the offset a few pixels per attempt.
    # The loop stops when find_element_by_id raises — presumably the slider
    # disappears once the captcha is solved; confirm.
    # ActionChains(driver).click_and_hold(button).perform()
    # ActionChains(driver).move_by_offset(xoffset=310, yoffset=0).perform()
    # ActionChains(driver).release().perform()
    for move in range(306, 310):
        try:
            button = driver.find_element_by_id(id)
            # press, drag, release
            ActionChains(driver).click_and_hold(button).perform()
            ActionChains(driver).move_by_offset(xoffset=move, yoffset=0).perform()
            ActionChains(driver).release().perform()

            time.sleep(0.5)
        except:
            break


    time.sleep(3)
    driver.find_element_by_xpath('//button[@type="submit"]').click()
    time.sleep(5)

    # driver.get('https://www.qcc.com/web/search?key=%E5%8C%BB%E9%99%A2')
    # while '验证通过' not in driver.page_source:
    #     for move in range(300, 311):
    #         detail_verify( move, 'nc_1_n1z')
    #         time.sleep(1)
    #         driver.get(url)
    #         if '<div class="text-center regTab m-t-xl">您的操作过于频繁，验证后再操作</div>' in driver.page_source:
    #             print('滑动失败，操作过于频繁，请切换账号或稍候重新登陆')
    #         else:
    #             print('滑动成功')
    #             break

    # Not on the home page -> login failed: rebuild the driver and retry with
    # the next account.  NOTE(review): the `while` acts as an `if` (the body
    # always returns), and if every account fails this recurses without bound
    # — confirm intended behavior.
    while driver.current_url != 'https://www.qcc.com/':
        print('用户名或密码错误，请检查程序登陆！！！')
        driver.quit()
        driver = creat_driver()
        return login()

    # Serialize the browser cookies into a single 'k=v;' header string.
    the_cookie = driver.get_cookies()
    my_cookie = ''
    for cook in the_cookie:
        cookie = cook['name'] + '=' + cook['value'] + ';'
        my_cookie += cookie
    return my_cookie

def detail_verify(move, id_name):
    ''' Solve the slide captcha once: drag the slider `move` pixels right and
    click the verify button.

    id_name: 'nc_1_n1t' is the login-page slider, 'nc_1_n1z' the
    verification-page slider.
    '''

    global driver

    # Refresh first so the captcha widget starts from a clean state.
    print('正在滑动验证码')
    driver.refresh()
    time.sleep(0.5)

    slider = driver.find_element_by_id(id_name)

    # Press, drag horizontally by `move` pixels, release.
    ActionChains(driver).click_and_hold(slider).perform()
    ActionChains(driver).move_by_offset(xoffset=move, yoffset=0).perform()
    ActionChains(driver).release().perform()

    # Give the widget a moment to settle, then submit the verification.
    time.sleep(2)
    driver.find_element_by_xpath('//button[@id="verify"]').click()

def active_ver(url):
    ''' Proactively visit the verification URL and clear whatever block the
    site has raised.

    Two cases: a "too frequent" page gets the slide captcha solved (several
    offsets are tried); an account/IP ban waits for a manual network switch,
    rebuilds the driver and re-logs-in.  Recurses until the page loads
    normally.
    '''
    global driver
    global headers

    try:
        driver.get(url)
        time.sleep(10)
        data_var = driver.page_source
        # "Too frequent" verification page -> solve the slide captcha.
        if '您的操作过于频繁，验证后再操作' in data_var:
            # slider element id: nc_1_n1z
            logs.write('[WARN]\t您的操作过于频繁，验证后再操作\t{}\n'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
            for move in range(300, 311):
                print('正在滑动验证码')
                # Fix: the original called detail_verify(driver, move, id) —
                # one argument too many (always a TypeError) — inside a
                # duplicated nested `for move` loop that shadowed this one.
                detail_verify(move, 'nc_1_n1z')
                time.sleep(1)
                driver.get(url)
                if '<div class="text-center regTab m-t-xl">您的操作过于频繁，验证后再操作</div>' in driver.page_source:
                    print('滑动失败，操作过于频繁，重新验证')
                    time.sleep(3)
                    active_ver(url)
                else:
                    print('滑动成功')

        # Account/IP banned -> prompt for a manual network switch, then
        # rebuild the driver and log in with the next account in rotation.
        while '您的账号访问超频，请稍后访问或联系客服人员' in data_var:
            print('ip被封，请切换热点')
            logs.write('[WARN]\tip被封，请切换热点\t{}\n'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
            record_log_exception('ip或账号被封')
            # Re-login with a fresh driver.
            driver.quit()
            print('退出driver')
            time.sleep(3)
            driver = creat_driver()
            time.sleep(1)
            my_cookie = login()
            # Refresh the cookie used by the plain `requests` calls.
            headers['cookie'] = my_cookie
            driver.get(url)
            time.sleep(10)
            data_var = driver.page_source

    except Exception as e:
        # Driver died or the site refused the connection: wait and retry.
        print("driver错误，可能被目标网站拒绝访问,切换网络或者请稍后")
        record_log_exception("driver错误，可能被目标网站拒绝访问,切换网络或者请稍后",e)
        time.sleep(10)
        active_ver(url)

def get_data(p,start,end,key):
    ''' Fetch page `p` of search results for `key`, filtered to companies
    founded between `start` and `end` (YYYYMMDD strings).

    Handles the site's defenses in-line: "not a result page" triggers
    browser-based verification, an expired cookie triggers a re-login with a
    fresh driver, and a "too frequent" page triggers the slide captcha.
    Side effects: updates globals key_num (total hits for this filter) and
    num (row counter used for periodic pauses); may rebuild `driver` and
    refresh headers['cookie'].  Returns the page HTML as a str.
    '''

    global num
    global p_now
    global start_time
    global end_time
    global now_time
    global type_list
    global type_now
    global current_url
    global key_num
    global driver

    # Search URL filtered by founding date range, page p.
    url = 'https://www.qcc.com/web/search?key={}&filter=%7B%22f%22%3A%5B%22T%22%5D,%22d%22%3A%5B%7B%22start%22%3A%22{}%22,%22end%22%3A%22{}%22,%22value%22%3A%22{}%22,%22x%22%3Atrue%7D%5D%7D&p={}'.format(
        key,str(start), str(end),str(str(start)+'-'+str(end)), p)
    # Matching verification URL for the same query.
    url_var = 'https://www.qcc.com/index_verify?type=companysearch&back=/web/search?key={}&filter=%7B%22f%22%3A%5B%22T%22%5D,%22d%22%3A%5B%7B%22start%22%3A%22{}%22,%22end%22%3A%22{}%22,%22value%22%3A%22{}%22,%22x%22%3Atrue%7D%5D%7D&p={}'.format(
        key, str(start), str(end), str(str(start) + '-' + str(end)), p
    )

    # Pause periodically to look less like a bot.
    # NOTE(review): the loop below sleeps 1 second, but the message promises
    # "1 minute" — confirm whether range(60) was intended.
    if num > 5000 :
        num = 0
        print('以获取5000千条数据，休息1分钟')
        logs.write('[INFO]\t已获取5000千条数据。数据类型：{}数据时间：{}\t{}\n'.format(type_now,now_time,time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
        for i in range(1):
            print(i,'s')
            time.sleep(1)

    try:
        data = requests.get(url, headers=headers,timeout=5).content.decode('utf-8')
    except Exception as e:
        print(e)
        record_log_exception("响应超时",e)
        print("响应超时")
        print("请检查网络，10秒后刷新")
        # One retry after a 10s pause (no timeout on the retry).
        time.sleep(10)
        data = requests.get(url, headers=headers).content.decode('utf-8')


    # "小查为您找到" marks a normal result page; anything else means we were
    # served a block/verification page — clear it through the browser.
    while '小查为您找到' not in data:
        print('网页解析错误，主动验证')
        time.sleep(3)
        print('key_num',key_num)
        driver.get(url)
        time.sleep(1)
        active_ver(url_var)
        # Verification cleared; re-fetch the page.
        data = requests.get(url, headers=headers).content.decode('utf-8')

    # Total hit count for this filter (sets the page count in jugesd()).
    key_num = int(re.findall('小查为您找到\\s+<.*?>(\\d+)<.*?>', data, re.S)[0])

    if key_num == 0:
        # A zero count can itself be a soft block — verify once and re-check.
        driver.get(url)
        time.sleep(1)
        active_ver(url_var)
        data = requests.get(url, headers=headers).content.decode('utf-8')

        key_num = int(re.findall('小查为您找到\\s+<.*?>(\\d+)<.*?>', data, re.S)[0])

    # Login wall -> cookie expired: rebuild the driver and log in again.
    while '<a class="btn btn-primary" data-v-78d0dd48>立即登录</a>' in data:
        print("身份失效，重新登陆")

        logs.write('[WARN]\tcookie失效，重新登陆\t{}\n'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))

        driver.quit()
        time.sleep(3)
        driver = creat_driver()
        my_cookie = login()
        headers['cookie'] = my_cookie

        time.sleep(3)
        data = requests.get(url, headers=headers, timeout=3).content.decode('utf-8')

    # Rate-limit page -> solve the slide captcha, trying several offsets.
    while '您的操作过于频繁，验证后再操作' in data:
        print("操作频繁")
        for move in range(300, 311):
            # Fix: the original called detail_verify(driver, move, id) with
            # one argument too many, which raised TypeError on this path.
            detail_verify(move, 'nc_1_n1t')
            time.sleep(1)
            driver.get(url)
            if '您的操作过于频繁，验证后再操作' in driver.page_source:
                print('滑动失败')
            else:
                print('滑动成功')
                data = requests.get(url, headers=headers, timeout=3).content.decode('utf-8')
                break

    return data

def parse_data(datas):
    ''' Parse one search-result page and append its rows to the current
    topic's CSV file.

    datas: page HTML (str).  Extracts name/person/phone/money/time per table
    row, substituting placeholders for missing fields, appends the rows to
    ./data/<type_now>.csv and adds the row count to the global `num`.
    Returns the list of row dicts.
    '''

    global num
    global p_now
    global start_time
    global end_time
    global now_time
    global type_list
    global type_now
    global current_url
    global key_num

    tree = etree.HTML(datas)
    rows = tree.xpath('/html/body//table[@class="ntable ntable-list"]/tr')

    records = []
    for row in rows:
        # Field extraction; every xpath yields a (possibly empty) list.
        name = ''.join(row.xpath('./td[3]/div/a/span//text()'))
        person = row.xpath('./td[3]/div/div[3]/div[1]/span[1]/span/span/a/text()')
        phone = row.xpath('./td[3]/div/div[3]/div[2]/span[1]/span/span[2]/text()')
        money = row.xpath('./td[3]/div/div[3]/div[1]/span[2]/span/text()')
        time_ = row.xpath('./td[3]/div/div[3]/div[1]/span[3]/span/text()')

        # Substitute placeholders for anything the page didn't provide.
        if not name:
            print('数据name为空')
            name = ''
        if not person:
            print('数据person为空')
            person = ['']
        if not phone:
            print('数据phone为空')
            phone = [-1]
        if not money:
            print('数据money为空')
            money = [0]
        if not time_:
            time_ = ['00-00-00']
            print('数据time为空')

        try:
            records.append({'name':name,'person':person[0],'phone':phone[0],'money':money[0],'time':time_[0]})
        except Exception as e:
            print('爬取数据异常，请检查网页')
            print(e)
            record_log_exception('爬取数据异常，xpath解析错误，请检查网页',e)

    num += len(records)
    # Append to the per-topic CSV (no header row is ever written).
    try:
        with open('./data/'+type_now+'.csv','a',newline='',encoding='utf-8') as f:
            csv.DictWriter(f,['name','person','phone','money','time']).writerows(records)
    except Exception as e:
        print('文件写入异常',e)
        record_log_exception('文件写入异常',e)
    return records

def record_log_exception(inf,e=None):
    ''' Append an ERROR entry plus a DATA resume-point entry to the log.

    inf: human-readable description of what failed.
    e: the caught exception, or None for a message-only entry; when present
       its line number is included in the ERROR line.
    The DATA line records the current topic, crawl date and page so
    read_log() can resume from it.  Flushes so the entry survives a crash.
    '''

    log_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # `is None`, not `== None`; the two branches previously duplicated the
    # DATA/flush tail, now shared below.
    if e is None:
        logs.write('[ERROR]\t{}\t{}\n'.format(inf, log_time))
    else:
        logs.write('[ERROR]\t{}-错误信息：{}-错误所在的行号：{}\t{}\n'.format(inf,e, e.__traceback__.tb_lineno, log_time))

    # Record the current crawl topic / date / page as the resume point.
    data_json = json.dumps({"p_now": p_now, "now_time": now_time, "type_now": type_now})
    logs.write('[DATA]\t' + data_json + '\t{}\n'.format(log_time))
    logs.flush()

def read_log(path = './logs/info.log'):
    ''' Restore crawl resume state from the newest usable [DATA] log entry.

    Scans the log from the end for a [DATA] line whose JSON payload contains
    a non-empty "now_time", then sets start_time / log_p from it and trims
    type_list to the topics not yet finished.  If no usable entry exists the
    globals are left untouched (the original crashed on that path).
    '''

    global start_time
    global log_p
    global type_list

    with open(path,'r',encoding='utf-8') as f:
        info_list = f.readlines()

    # Newest-first scan; parse each candidate payload once.
    info_data = None
    for line in reversed(info_list):
        if 'DATA' in line:
            candidate = json.loads(line.split('\t')[1])
            if 'now_time' in candidate and len(candidate['now_time']) != 0:
                info_data = candidate
                break

    if info_data is None:
        # No usable resume point — keep the configured defaults.
        print('日志中没有可用的[DATA]记录，按初始配置爬取')
        return

    start_time = info_data['now_time']
    log_p = info_data['p_now']
    # Drop the topics already completed before the recorded one.
    type_list = type_list[type_list.index(info_data['type_now']):]

    print("log:",info_data)

if __name__ == '__main__':

    # SECURITY NOTE(review): real account credentials are hard-coded in
    # plaintext — these should be loaded from a config file or environment
    # variables, not committed with the source.
    users_list = [{'username':'19892951690','password':'3535234954nn'},
                  {'username':'18810217312','password':'inamlin0808'},
                  {'username':'18974652562','password':'1171460872nn'},
                  {'username':'13366321690','password':'19892951690nn'},
                  ]
    # Recommended: 5-10 VIP accounts (rotated by login()).
    users_id = 0
    # Index of the account currently in use.

    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36',
        'Host' : 'www.qcc.com'
    }
    # Request headers for the plain `requests` calls; login() adds 'cookie'.
    driver = creat_driver()
    # Shared selenium driver (recreated by login()/get_data() after bans).
    num = 1
    # Running count of scraped rows (get_data() pauses past a threshold).
    p_now = 0
    # Current result page (recorded as the resume point).
    start_time = [2021,1,1]
    # Crawl start date as [year, month, day] (overwritten by read_log()).
    end_time = []
    # Optional crawl end as [year, month]; empty means "until today".
    now_time = []
    # Date window currently being crawled.
    # type_list = ['制造业','科技','材料','智能','医药']
    type_list = ['制造业','科技','材料','智能','医药']
    # Search topics, crawled in order.
    type_now = ''
    # Topic currently being crawled (names the output CSV).
    current_url=''
    # Current URL; the target site currently has no such anti-scraping check.
    key_num = 0
    # Total hit count for the current filter.
    log_p = -1
    # Page restored from the log; -1 means "no resume".
    logs = open('./logs/info.log','a',encoding='utf-8')
    # Append-mode log file, shared by all functions; closed in `finally`.

    try:
        print("启动")
        main()
        print('结束')
    except Exception as e:
        print('错误所在的行号：', e.__traceback__.tb_lineno)
        print('错误信息', e)
        record_log_exception("出现严重错误",e)
    finally:
        logs.close()

# INFO